Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / freeWAIS-sf-1.1 / ir / weight.c < prev next >

Wrap

C/C++ Source or Header | 1994-05-31 | 7.0 KB | 218 lines

/*                               -*- Mode: C -*-
 * weight.c --
 * ITIID           : $ITI$ $Header $__Header$
 * Author          : Huynh Quoc T. Tung
 * Created On      : Tue May 17 11:45:07 1994
 * Last Modified By: Ulrich Pfeifer
 * Last Modified On: Mon May 30 17:13:10 1994
 * Update Count    : 35
 * Status          : Unknown, Use with caution!
 */

/*********************   INTRODUCTION   ************************
 * The documents are represented by term vectors of the form
 *       D = (t_0,w_d0; t_1,w_d1; ..., t_t,w_dt)
 * where each t_k identifies a content term assigned to some sample
 * document and w_dk represents the weight of term t_k in document D
 * (or query Q). Thus, a typical query Q might be formulated as
 *       Q = (q_0,w_q0; q_1,w_q1; ...; q_t,w_qt)
 * where q_k once again represents a term assigned to query Q.
 * The weights are allowed to vary continuously between 0 and 1, the
 * higher weight assignments near 1 being used for the most important terms,
 * whereas lower weights near 0 characterize the less important terms.
 * Given the vector representation, a query-document similarity value may
 * be obtained by comparing the corresponding vectors, using for example
 * the conventional vector product formula
 *       similarity(Q,D) = sum(w_qk * w_dk),   k=1 to t.
 *
 * Three factors are important for term weighting:
 * 1) term frequency in the individual document (recall)
 * 2) inverse document frequency (precision)
 * 3) document length (vector length)
 *
 * Term frequency component used here: new_wgt = 0.5 + 0.5 * tf / max_tf
 * augmented normalized term frequency (tf factor normalized by maximum tf
 * in the vector, and further normalized to lie between 0.5 and 1.0).
 *
 * Collection frequency component used here: 1.0
 * no change in weight; use original term frequency component.
 *
 * Normalization component used here: sqrt(sum(new_wgt^2)) = vector length.
 *
 * Thus, the document term weight is: w_dk = new_wgt / vector length
 *
 * For query term weighting it is assumed that tf is equal to 1, so that
 * w_qk = 1.
 *
 ****************************************************************************/

/*********************   PROCEDURE DESCRIPTION   ************************
 * void assign_term_weight_for_doc(number_of_elements, db)
 * long *number_of_elements;
 * database *db;
 *
 * compute the weight of every term of the current document and store it
 * in the term's in-memory buffer.
 *
 * void write_weight_in_ptr(weight, ptr)
 * float weight;
 * unsigned char *ptr;
 *
 * store the weight in the buffer at ptr.
 *
 * float read_weight_from_stream(new_weight_size, stream)
 * long new_weight_size;
 * FILE *stream;
 *
 * read a weight from stream (used while searching). returns the weight.
 *
 * void save_terms_for_doc(number_of_terms, db)
 * long *number_of_terms;
 * database *db;
 *
 * save all terms of the document before flushing to disk.
 *
 * void add_terms_saved(is_field, number_of_terms, doc_id, db)
 * boolean is_field;
 * long *number_of_terms;
 * long doc_id;
 * database *db;
 *
 * add all saved terms back into the hashtable.
 *
 ****************************************************************************/

#include "irfiles.h"
#include "cutil.h"

#ifdef NEW_WEIGHT

#include "futil.h"
#include "hash.h"
#include "irhash.h"
#include "weight.h"
#include <math.h>
#include <string.h>   /* memcpy, memset, strncpy */

long max_term_frequency = 0; /* used in irhash.c */
long bucket_ids_doc_array[DEFAULT_NUMBER_OF_BUCKETS + 1]; /* used in hash.c */

/*
 * Copy the first NEW_WEIGHT_SIZE bytes of the in-memory representation of
 * `weight` to `ptr`.  The bytes are written in host order; this is the
 * inverse of read_weight_from_stream().
 *
 * weight - value to store
 * ptr    - destination; must have room for NEW_WEIGHT_SIZE bytes
 */
void write_weight_in_ptr(weight, ptr)
     float weight;
     unsigned char *ptr;
{
  float stored = weight;

  memcpy(ptr, &stored, NEW_WEIGHT_SIZE);
}

/*
 * Read a weight that was stored as `new_weight_size` raw bytes from `stream`.
 *
 * Exactly `new_weight_size` bytes are consumed when they are available, so
 * the stream position afterwards is the same as before this fix; however,
 * at most sizeof(float) of them are stored, which keeps an oversized
 * `new_weight_size` from overrunning the local float.
 *
 * Returns the decoded weight, or 0.0 if the stream ends (or fails) before
 * all bytes could be read.  A non-positive `new_weight_size` also yields 0.0.
 */
float read_weight_from_stream(new_weight_size, stream)
     long new_weight_size;
     FILE *stream;
{
  float weight = 0.0;
  unsigned char *dst = (unsigned char *)&weight;
  long i;
  int c;

  for (i = 0; i < new_weight_size; i++) {
    c = fgetc(stream);
    if (c == EOF)
      return(0.0);              /* truncated record: no usable weight */
    if (i < (long)sizeof(weight))
      dst[i] = (unsigned char)c;
  }
  return(weight);
}

/*
 * Augmented normalized term frequency: 0.5 + 0.5 * tf / max_tf.
 * When max_tf is not positive the quotient is undefined, so the lower
 * bound of the range (0.5) is returned instead of producing NaN/inf.
 */
static double augmented_term_frequency(tf, max_tf)
     long tf;
     long max_tf;
{
  if (max_tf <= 0)
    return(0.5);
  return(0.5 + (0.5 * tf / max_tf));
}

/*
 * Compute the normalized weight of every term recorded for the current
 * document and write it into the term's memory buffer.
 *
 * The first *number_of_elements entries of bucket_ids_doc_array are used as
 * indices into db->the_word_memory_hashtable->contents.  For each entry the
 * weight is augmented_tf / sqrt(sum(augmented_tf^2)) and is written
 * NEW_WEIGHT_SIZE bytes before the entry's current_memory_ptr.
 *
 * Side effects: each visited entry's occurances_in_doc is reset to 0,
 * max_term_frequency and *number_of_elements are reset to 0, and
 * bucket_ids_doc_array is cleared.
 */
void assign_term_weight_for_doc(number_of_elements, db)
     long *number_of_elements;
     database *db;
{
  long i;
  long id;
  long tf;
  double atf;
  float new_wgt;
  float doc_len = 0.0;
  hashtable *htable = db->the_word_memory_hashtable;

  /* compute document length (Euclidean norm of the term vector) */
  for (i = 0; i < *number_of_elements; i++) {
    id = bucket_ids_doc_array[i];
    tf = (htable->contents)[id].occurances_in_doc;
    atf = augmented_term_frequency(tf, max_term_frequency);
    doc_len += atf * atf;
  }
  doc_len = sqrt(doc_len);

  /* store the length-normalized weight of every term */
  for (i = 0; i < *number_of_elements; i++) {
    id = bucket_ids_doc_array[i];
    tf = (htable->contents)[id].occurances_in_doc;
    (htable->contents)[id].occurances_in_doc = 0;
    new_wgt = augmented_term_frequency(tf, max_term_frequency) / doc_len;
    if ((htable->contents)[id].current_memory_ptr == NULL) {
      fprintf(stderr, "panic: assign_term_weight_for_doc current_memory_ptr == NULL\n");
    }
    else {
      /* the weight slot is the last NEW_WEIGHT_SIZE bytes written so far */
      write_weight_in_ptr(new_wgt,
                          (htable->contents)[id].current_memory_ptr - NEW_WEIGHT_SIZE);
    }
  }

  /* reset the per-document state; clear the whole array, including the
     extra "+ 1" slot of its declaration */
  max_term_frequency = 0;
  *number_of_elements = 0;
  memset(bucket_ids_doc_array, 0, sizeof(bucket_ids_doc_array));
}

/* Terms of the current document saved by save_terms_for_doc() and
   consumed (and released) by add_terms_saved(). */
term_infotable *termtable = NULL;

/*
 * Save the key, character position and term frequency of the first
 * *number_of_terms entries listed in bucket_ids_doc_array into termtable.
 *
 * termtable is allocated here only when it is currently NULL; each saved
 * key gets its own MAX_KEY_SIZE+1 byte buffer.  Both are released by
 * add_terms_saved().
 *
 * For every saved entry the hashtable slot is then rewound: if its
 * memory_size equals WORD_MEMORY_INIT_BLOCK_SIZE, number_of_occurances is
 * set to STOP_WORD_FLAG, otherwise current_memory_ptr is moved back by
 * WORD_MEMORY_INIT_BLOCK_SIZE.
 * NOTE(review): presumably this discards the entry just written for the
 * document so that it can be re-added after the flush -- confirm in hash.c.
 *
 * *number_of_terms is set to 0 on return.
 */
void save_terms_for_doc(number_of_terms, db)
     long *number_of_terms;
     database *db;
{
  /* distance from current_memory_ptr back to the character position field */
  int cn_size = CHARACTER_POSITION_SIZE + NEW_WEIGHT_SIZE;
  long i, id;
  hashtable *htable = db->the_word_memory_hashtable;

  if (*number_of_terms != 0) {
    if (termtable == NULL)
      termtable = (term_infotable *)s_malloc(sizeof(term_infotable) * *number_of_terms);
    if (termtable == NULL)
      panic("Out of memory");
  }

  for (i = 0; i < *number_of_terms; i++) {
    id = bucket_ids_doc_array[i];

    if ((termtable[i].term = (char *)s_malloc(sizeof(char) * (MAX_KEY_SIZE + 1))) == NULL)
      panic("Out of memory");
    strncpy(termtable[i].term, (htable->contents)[id].key, MAX_KEY_SIZE);
    /* strncpy does not terminate when the key fills the buffer */
    termtable[i].term[MAX_KEY_SIZE] = '\0';

    termtable[i].char_pos =
      read_bytes_from_memory(CHARACTER_POSITION_SIZE,
                             (htable->contents)[id].current_memory_ptr - cn_size);
    termtable[i].tf = (htable->contents)[id].occurances_in_doc;

    if ((htable->contents)[id].memory_size - WORD_MEMORY_INIT_BLOCK_SIZE == 0)
      (htable->contents)[id].number_of_occurances = STOP_WORD_FLAG;
    else
      (htable->contents)[id].current_memory_ptr -= WORD_MEMORY_INIT_BLOCK_SIZE;
  }
  *number_of_terms = 0;
}

/*
 * Re-add the terms stored in termtable by save_terms_for_doc().
 *
 * is_field        - when true the terms are added with field_add_word(),
 *                   otherwise with add_word()
 * number_of_terms - in: number of saved terms; set to 0 before the terms
 *                   are added
 * doc_id          - document id passed through to the add function
 * db              - database whose word hashtable receives the terms
 *
 * After each term is added, the occurances_in_doc of the hashtable entry
 * indexed by bucket_ids_doc_array[i] is restored to the saved term
 * frequency.  All saved keys and termtable itself are released, and
 * termtable is left NULL so that the next save_terms_for_doc() allocates a
 * fresh table.
 */
void add_terms_saved(is_field, number_of_terms, doc_id, db)
     boolean is_field;
     long *number_of_terms;
     long doc_id;
     database *db;
{
  long i, id;
  long number_of_elements = *number_of_terms;

  *number_of_terms = 0;

  for (i = 0; i < number_of_elements; i++) {
    if (is_field) {
      field_add_word(termtable[i].term, termtable[i].char_pos, 0, 1, doc_id, 0, 0, db, false);
    }
    else {
      add_word(termtable[i].term, termtable[i].char_pos, 0, 1, doc_id, 0, 0, db, false);
    }

    id = bucket_ids_doc_array[i];
    (db->the_word_memory_hashtable->contents)[id].occurances_in_doc = termtable[i].tf;

    if (termtable[i].term != NULL) {
      s_free(termtable[i].term);
      termtable[i].term = NULL;
    }
  }

  if (termtable != NULL) {
    s_free(termtable);
    /* never leave a dangling table behind: save_terms_for_doc() reuses
       termtable whenever it is non-NULL */
    termtable = NULL;
  }
}

#endif /* NEW_WEIGHT */